In [6]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
In [24]:
from Bio import SeqIO
from Bio.Align import MultipleSeqAlignment
from collections import Counter
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; the same utilities now live in sklearn.model_selection.
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import auc, roc_auc_score, accuracy_score, roc_curve
from sklearn.preprocessing import LabelBinarizer
from util.isoelectric_point import isoelectric_points
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling used throughout the notebook.
sns.set_context('notebook')
sns.set_style('white')
In [8]:
# Load the sequences into memory.
# NOTE: instead of computing a proper multiple sequence alignment, we cheat and
# keep only sequences that are already exactly the expected full length, so
# that residue positions line up column-for-column.
EXPECTED_LENGTH = 566  # full-length HA sequences in this dataset
sequences = [s for s in SeqIO.parse('data/20160127_HA_prediction.fasta', 'fasta')
             if len(s.seq) == EXPECTED_LENGTH]
len(sequences)
Out[8]:
In [9]:
# Sequence accession IDs, taken directly from the already-loaded `sequences`
# instead of re-parsing the FASTA file a second time. This is faster and
# guarantees that seqids[i] corresponds to sequences[i].
seqids = [s.id for s in sequences]
len(seqids)
Out[9]:
In [10]:
# Build a position-wise table of the (equal-length) sequences: one row per
# sequence, one column per position, each cell a single amino-acid letter.
# Note: this cell takes a while.
aln = MultipleSeqAlignment(sequences)
seq_df = pd.DataFrame(np.array(aln))
seq_df.head()
Out[10]:
In [11]:
# Transform the per-position amino-acid letters into isoelectric-point values.
# Passing the mapping dict directly is the idiomatic (and version-safe) form:
# replace(keys(), values()) hands pandas two dict views, which several pandas
# versions reject; a dict makes the letter -> pI mapping explicit.
seq_feats = seq_df.replace(isoelectric_points)
seq_feats.index = seqids
seq_feats.head()
Out[11]:
In [12]:
# Sanity check: after the replacement every cell should be numeric, so no
# column may still contain string (i.e. un-mapped amino-acid) values.
for col in seq_feats.columns:
    assert not any(isinstance(value, str) for value in set(seq_feats[col]))
In [13]:
# Load the host-species labels.
labels = pd.read_csv('data/20160127_HA_prediction.csv', parse_dates=['Collection Date'])
# Normalize host-species names by stripping database prefixes/suffixes.
labels['Host Species'] = labels['Host Species'].str.replace('IRD:', '').str.replace('/Avian', '')
# Strip the literal '*' marker from accessions. regex=False is important here:
# interpreted as a regular expression, '*' is invalid ("nothing to repeat").
labels['Sequence Accession'] = labels['Sequence Accession'].str.replace('*', '', regex=False)
# Index by accession so the labels can be joined against the feature matrix.
labels.set_index('Sequence Accession', inplace=True)
labels.head()
Out[13]:
In [14]:
# Attach the host-species label to each feature row, keeping only accessions
# that appear in both tables (inner join on the shared index).
host_species = labels['Host Species']
data_matrix = seq_feats.join(host_species, how='inner')
data_matrix.head()
Out[14]:
In [15]:
# Quickly inspect the different labels under "host species"
# set(data_matrix['Host Species'])
In [16]:
# These labels do not identify a concrete host species; rows carrying them are
# the "unknowns" whose host we will predict later.
unknown_labels = ['Avian', 'Bird', 'Environment', 'Unknown', 'null']
known_labels = set(data_matrix['Host Species']).difference(unknown_labels)
In [17]:
# Partition the rows into "unknowns" (uninformative host label) and "knowns"
# (concrete host species). Since known_labels is exactly the complement of the
# unknown labels, negating the mask selects the known rows.
is_unknown = data_matrix['Host Species'].isin(unknown_labels)
unknowns = data_matrix[is_unknown]
knowns = data_matrix[~is_unknown]
In [18]:
# Convert the known host-species strings into a one-hot (indicator) matrix so
# they can be used as the prediction target for the training algorithm.
lb = LabelBinarizer()
# LabelBinarizer sorts its classes internally, so fitting on the (unordered)
# set is deterministic; list() suffices — no comprehension needed.
lb.fit(list(known_labels))
lb.transform(knowns['Host Species'])  # note: this has not done anything to the original data.
Out[18]:
We're almost ready for training a machine learning model to classify the unknown hosts based on their sequence.
Here's the proper procedure.
This procedure is known as cross-validation: a powerful yet cheap and easy method for evaluating how well a particular supervised learning model performs.
In [19]:
# Split the data into a training and testing set.
# The feature columns are the 566 alignment positions (integer column names).
X_cols = list(range(566))
X = knowns[X_cols]
Y = lb.transform(knowns['Host Species'])
# Fix the random seed so the split — and every score computed downstream —
# is reproducible across notebook re-runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
In [20]:
# Train a Random Forest Classifier.
# Note: This cell takes a while; any questions?
# random_state makes the (stochastic) forest reproducible across re-runs, and
# n_estimators is pinned explicitly because its default value has changed
# between scikit-learn versions (10 -> 100).
clf = RandomForestClassifier(n_estimators=100, random_state=42)
# Train (i.e. "fit") the classifier to the training Xs and Ys
clf.fit(X_train, Y_train)
# Make predictions on the held-out test X
preds = clf.predict(X_test)
preds
Out[20]:
In [21]:
# Decode the one-hot predictions back into human-readable species labels.
lb.inverse_transform(preds)
Out[21]:
In [22]:
# Accuracy: the fraction of test sequences whose host species was classified
# correctly. Both one-hot arrays are decoded back to string labels first.
true_labels = lb.inverse_transform(Y_test)
predicted_labels = lb.inverse_transform(preds)
accuracy_score(true_labels, predicted_labels)
Out[22]:
In [23]:
# Predict hosts for the sequences with uninformative labels, decoding the
# dummy-encoded predictions back into species-name strings in one step.
unknown_preds = lb.inverse_transform(clf.predict(unknowns[X_cols]))
unknown_preds
Out[23]:
What this gives us is the class label with the highest probability of being the correct one.
While we will not do this here, at this point it would be a good idea to sanity-check your work. Are the sequences that are predicted to be Human truly of a close sequence similarity to actual Human sequences? You may want to do a Multiple Sequence Alignment, or you might want to simply compute the Levenshtein or Hamming distance between the two sequences, as a sanity check.
Depending on the classifier used, you can peer inside the model to get a feel for what the classifier learned about the features that best predict the class label.
The RandomForestClassifier provides a feature_importances_ attribute that we can access and plot.
In [25]:
# Plot per-position feature importances; peaks mark sequence positions that
# were most informative for predicting the host species. Explicit fig/ax and
# axis labels make the figure self-explanatory when skimmed.
fig, ax = plt.subplots()
ax.plot(clf.feature_importances_)
ax.set_xlabel('Sequence position')
ax.set_ylabel('Feature importance')
ax.set_title('Random forest feature importances per HA position')
plt.show()
Out[25]:
The interpretation here is that the positions with greater "feature importance" were better at predicting the host class.
In [ ]:
In [ ]:
In [ ]:
In [ ]: